Load raw data, annotate probes using biomaRt, filter probes and load SFARI genes

Filtering criteria:

if(!file.exists('./../Data/Gandal_RNASeq.RData')){
  
  # Load csvs
  datExpr = read.csv('./../../Gandal/RNAseq/raw_data/RNAseq_ASD_datExpr.csv', row.names=1)
  datMeta = read.csv('./../../Gandal/RNAseq/raw_data/RNAseq_ASD_datMeta.csv')
  SFARI_genes = read_csv('./../../Gandal/RNAseq/working_data/SFARI_genes_01-15-2019.csv')
  
  # Make sure datExpr and datMeta columns/rows match
  rownames(datMeta) = paste0('X', datMeta$Dissected_Sample_ID)
  if(!all(colnames(datExpr) == rownames(datMeta))){
    print('Columns in datExpr don\'t match the rowd in datMeta!')
  }
  
  # Annotate probes
  getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
              'end_position','strand','band','gene_biotype','percentage_gc_content')
  mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
                 dataset='hsapiens_gene_ensembl',
                 host='feb2014.archive.ensembl.org') ## Gencode v19
  datProbes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
  datProbes = datProbes[match(rownames(datExpr), datProbes$ensembl_gene_id),]
  datProbes$length = datProbes$end_position-datProbes$start_position
  
  # Group brain regions by lobes
  datMeta$Brain_Region = as.factor(datMeta$Region)
  datMeta$Brain_lobe = 'Occipital'
  datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA4_6', 'BA9', 'BA24', 'BA44_45')] = 'Frontal'
  datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA3_1_2_5', 'BA7')] = 'Parietal'
  datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA38', 'BA39_40', 'BA20_37', 'BA41_42_22')] = 'Temporal'
  datMeta$Brain_lobe=factor(datMeta$Brain_lobe, levels=c('Frontal', 'Temporal', 'Parietal', 'Occipital'))
  
  #################################################################################
  # FILTERS:
  
  # 1 Filter probes with start or end position missing (filter 5)
  # These can be filtered without probe info, they have weird IDS that don't start with ENS
  to_keep = !is.na(datProbes$length)
  datProbes = datProbes[to_keep,]
  datExpr = datExpr[to_keep,]
  rownames(datProbes) = datProbes$ensembl_gene_id
  
  # 2. Filter samples from ID AN03345 (filter 2)
  to_keep = (datMeta$Subject_ID != 'AN03345')
  datMeta = datMeta[to_keep,]
  datExpr = datExpr[,to_keep]
  
  # 3. Filter samples with rowSums <= 40 (at least half of the samples without any expression)
  to_keep = rowSums(datExpr)>40
  datExpr = datExpr[to_keep,]
  datProbes = datProbes[to_keep,]

  #################################################################################
  # DEA using DESeq2
  counts = as.matrix(datExpr)
  rowRanges = GRanges(datProbes$chromosome_name,
                      IRanges(datProbes$start_position, width=datProbes$length),
                      strand=datProbes$strand,
                      feature_id=datProbes$ensembl_gene_id)
  
  se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
  ddsSE = DESeqDataSet(se, design =~Diagnosis_)
  
  dds = DESeq(ddsSE)
  DE_info = results(dds) %>% data.frame %>% rownames_to_column(var = 'ID') %>%
            mutate('logFC'=log2FoldChange, 'adj.P.Val'=padj) %>% 
            dplyr::select(-log2FoldChange, -padj)
  
  #################################################################################
  # Annotate SFARI genes
  
  # Get ensemble IDS for SFARI genes
  mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
                 dataset='hsapiens_gene_ensembl',
                 host='feb2014.archive.ensembl.org') ## Gencode v19
  
  gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'), 
                     values=SFARI_genes$`gene-symbol`, mart=mart) %>% 
                     mutate('gene-symbol'=hgnc_symbol, 'ID'=as.character(ensembl_gene_id)) %>% 
                     dplyr::select('ID', 'gene-symbol')
  
  SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol')
  
  #################################################################################
  # Add functional annotation to genes from GO
  
  GO_annotations = read.csv('./../../Gandal/RNAseq/working_data/genes_GO_annotations.csv')
  GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
                mutate('ID' = as.character(ensembl_gene_id)) %>% 
                dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
                mutate('Neuronal' = 1)
  
  save(datExpr, datMeta, datProbes, GO_neuronal, SFARI_genes, DE_info, file='./../Data/Gandal_RNASeq.RData')
  
  rm(getinfo, to_keep, gene_names, mart, counts, dds, ddsSE, GO_annotations, rowRanges, se)
  
} else load('./../Data/Gandal_RNASeq.RData')

datExpr_backup = datExpr

Number of genes and samples:

dim(datExpr)
## [1] 33478    86


Gene count by SFARI score of remaining genes:

table(SFARI_genes$`gene-score`)
## 
##   1   2   3   4   5   6 
##  29  82 209 538 191  25


Gene count by Diagnosis and Brain lobe:

t(table(datMeta$Brain_lobe, datMeta$Diagnosis_))
##      
##       Frontal Temporal Parietal Occipital
##   ASD       8       14       14        15
##   CTL      13        6        8         8


Relation between SFARI scores and Neuronal functional annotation:

Neuronal_SFARI = data.frame('ID'=rownames(datExpr), 'Neuronal'=rownames(datExpr) %in% GO_neuronal$ID) %>%
                 left_join(SFARI_genes, by='ID')

tbl = table(Neuronal_SFARI$`gene-score`, Neuronal_SFARI$Neuronal, useNA='ifany') %>% t %>% as.data.frame.matrix
rownames(tbl) = c('Non-Neuronal','Neuronal')
tbl
##               1  2   3   4   5  6    NA
## Non-Neuronal 15 44 146 359 129 20 31601
## Neuronal      9 16  42  74  38  4   982
tbl = round(sweep(tbl, 2, colSums(tbl), `/`)*100, 2)
tbl
##                 1     2     3     4     5     6    NA
## Non-Neuronal 62.5 73.33 77.66 82.91 77.25 83.33 96.99
## Neuronal     37.5 26.67 22.34 17.09 22.75 16.67  3.01
rm(Neuronal_SFARI, tbl)



Boxplots for raw data

Mean and Standard Deviation by score

  • The higher the SFARI score, the higher the mean and the standard deviation except for the sd of SFARI scores 4 and 5
make_ASD_vs_CTL_df = function(datExpr){
  datExpr_ASD = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_=='ASD'))
  datExpr_CTL = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_!='ASD'))
  
  ASD_vs_CTL = data.frame('ID'=as.character(rownames(datExpr)),
                          'mean' = rowMeans(datExpr), 'sd' = apply(datExpr, 1, sd),
                          'mean_ASD'=rowMeans(datExpr_ASD), 'mean_CTL'=rowMeans(datExpr_CTL),
                          'sd_ASD'=apply(datExpr_ASD,1,sd), 'sd_CTL'=apply(datExpr_CTL,1,sd)) %>%
               mutate('mean_diff'=mean_ASD-mean_CTL, 'sd_diff'=sd_ASD-sd_CTL) %>%
               left_join(SFARI_genes, by='ID') %>%
               dplyr::select(ID, mean, sd, mean_ASD, mean_CTL, mean_diff, sd_ASD, sd_CTL, sd_diff, `gene-score`) %>%
               mutate('Neuronal'=ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Non-Neuronal'))
  
  ASD_vs_CTL_SFARI = ASD_vs_CTL %>% filter(!is.na(`gene-score`)) %>% 
                     mutate(Group = as.character(`gene-score`)) %>% dplyr::select(-c(Neuronal,`gene-score`))
  ASD_vs_CTL_Neuro = ASD_vs_CTL %>% mutate(Group = Neuronal) %>% dplyr::select(-c(Neuronal,`gene-score`))
  ASD_vs_CTL_all = ASD_vs_CTL %>% mutate(Group = 'All') %>% dplyr::select(-c(Neuronal,`gene-score`))
  
  ASD_vs_CTL_together = bind_rows(ASD_vs_CTL_SFARI, ASD_vs_CTL_Neuro, ASD_vs_CTL_all) %>% 
                        mutate(Group = ordered(Group, levels=c('1','2','3','4','5','6',
                                                               'Neuronal','Non-Neuronal','All'))) %>%
                        left_join(DE_info, by='ID')
  
  return(ASD_vs_CTL_together)
}

ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr)

p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, mean, fill=Group)) + 
              geom_boxplot() + theme_minimal() + 
              ggtitle('Mean (left) and Standard Deviation (right) by score') +
              scale_fill_manual(values=gg_colour_hue(9)) +
              theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, sd, fill=Group)) + 
              geom_boxplot() + theme_minimal() +
              ggtitle('Mean (left) and Standard Deviation (right) by score') +
              scale_fill_manual(values=gg_colour_hue(9)) +
              theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

subplot(p1, p2, nrows=1)
rm(p1, p2)

There is heterocedasticity in the data, but the variance increases at a lower rate than the mean, so the \(log_2\) transformation may affect the variance

ggplot(ASD_vs_CTL, aes(mean+1, sd+1)) + geom_point(alpha=0.1, aes(id=ID, fill=Group, color=Group)) +
         scale_color_manual(values=gg_colour_hue(9)) + scale_fill_manual(values=gg_colour_hue(9)) +  
         scale_x_continuous(trans='log2') + scale_y_continuous(trans='log2') + 
           ggtitle(paste0('Corr=',round(cor(ASD_vs_CTL$mean, ASD_vs_CTL$sd), 2))) +
         geom_smooth(method=lm, se=FALSE, color='#808080', size=0.5) + theme_minimal()

Difference between Diagnosis by score

  • The higher the SFARI score, the bigger the difference between the level of expression between Autism and Control

  • The higher the SFARI score, the smaller the log Fold Change between Autism and Control

p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(mean_diff), fill=Group)) + 
                geom_boxplot() + theme_minimal() + 
                ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
                scale_fill_manual(values=gg_colour_hue(9)) +
                theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(logFC), fill=Group)) + 
                geom_boxplot() + theme_minimal() +
                ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
                scale_fill_manual(values=gg_colour_hue(9)) +
                theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

subplot(p1, p2, nrows=1)
rm(p1, p2)

Boxplots for log transformed data

Mean and Standard deviation by score

  • The higher the SFARI score, the higher the mean but the lower the standard deviation (!)
ASD_vs_CTL = make_ASD_vs_CTL_df(log2(datExpr+1))

p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, mean, fill=Group)) + 
              geom_boxplot() + theme_minimal() +
              scale_fill_manual(values=gg_colour_hue(9)) +
              theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, sd, fill=Group)) + 
              geom_boxplot() + theme_minimal() +
              scale_fill_manual(values=gg_colour_hue(9)) +
              theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

subplot(p1, p2, nrows=1)
rm(p1, p2)

There is still heterocedasticity in the data, since the variance increased at a lower rate than the mean, by applying \(log_2\) transformation to the data, the relation between mean and variance inverted

ggplot(ASD_vs_CTL, aes(mean, sd)) + geom_point(alpha=0.1, aes(id=ID, fill=Group, color=Group)) +
         scale_color_manual(values=gg_colour_hue(9)) + scale_fill_manual(values=gg_colour_hue(9)) +  
           ggtitle(paste0('Corr=',round(cor(ASD_vs_CTL$mean, ASD_vs_CTL$sd), 2))) +
         geom_smooth(method=lm, se=FALSE, color='#808080', size=0.5) + theme_minimal()

Difference between Diagnosis by score

  • The higher the SFARI score, the smaller the difference between the level of expression between Autism and Control

  • The second plot is the same as with the raw data, but this time it agrees with the left plot

p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(mean_diff), fill=Group)) + 
                geom_boxplot() + theme_minimal() + 
                ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
                scale_fill_manual(values=gg_colour_hue(9)) +
                theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(logFC), fill=Group)) + 
                geom_boxplot() + theme_minimal() +
                ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
                scale_fill_manual(values=gg_colour_hue(9)) +
                theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

subplot(p1, p2, nrows=1)
rm(p1, p2)

Boxplots for vst normalised data

Mean and Standard deviation by score

  • The higher the SFARI score, the higher the mean but the lower the standard deviation (!)
counts = as.matrix(datExpr)
rowRanges = GRanges(datProbes$chromosome_name,
                    IRanges(datProbes$start_position, width=datProbes$length),
                    strand=datProbes$strand,
                    feature_id=datProbes$ensembl_gene_id)

se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
dds = DESeqDataSet(se, design =~Diagnosis_)
dds = estimateSizeFactors(dds)
vst_output = vst(dds)
datExpr_vst = assay(vst_output)

ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr_vst)

p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, mean, fill=Group)) + 
              geom_boxplot() + theme_minimal() +
              scale_fill_manual(values=gg_colour_hue(9)) +
              theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, sd, fill=Group)) + 
              geom_boxplot() + theme_minimal() +
              scale_fill_manual(values=gg_colour_hue(9)) +
              theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

subplot(p1, p2, nrows=1)
rm(p1, p2)
ggplot(ASD_vs_CTL, aes(mean, sd)) + geom_point(alpha=0.1, aes(id=ID, fill=Group, color=Group)) +
         scale_color_manual(values=gg_colour_hue(9)) + scale_fill_manual(values=gg_colour_hue(9)) +  
           ggtitle(paste0('Corr=',round(cor(ASD_vs_CTL$mean, ASD_vs_CTL$sd), 2))) +
         geom_smooth(method=lm, se=FALSE, color='#808080', size=0.5) + theme_minimal()

Difference between Diagnosis by score

  • The higher the SFARI score, the smaller the difference between the level of expression between Autism and Control

  • The second plot is the same as with the raw data, but this time it agrees with the left plot

p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(mean_diff), fill=Group)) + 
                geom_boxplot() + theme_minimal() + 
                ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
                scale_fill_manual(values=gg_colour_hue(9)) +
                theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(logFC), fill=Group)) + 
                geom_boxplot() + theme_minimal() +
                ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
                scale_fill_manual(values=gg_colour_hue(9)) +
                theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

subplot(p1, p2, nrows=1)
rm(p1, p2)

Conclusion